Purpose

Rename the IRD sequence records (ird_seqs) so that their IDs follow the same format as the new sequences (new_seqs):

  • seq.id should be: A/strain/name/here/2009|gene

In [1]:
from Bio import SeqIO
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

In [2]:
# Load every sequence record belonging to the 79 new Alaska waterfowl viruses.
new_seqs = list(SeqIO.parse('Alaska_waterfowl_79viruses_seqs_20151223.fasta', 'fasta'))
len(new_seqs)


Out[2]:
647

In [3]:
# Record count in units of 8 (the number of genome segments used in this notebook).
# A non-integer result suggests the records are not all complete 8-segment sets.
len(new_seqs) / 8


Out[3]:
80.875

In [4]:
# Load the IRD metadata table and normalise the columns used further down.
ird_data = pd.read_csv('20160104_brandt_ird.csv', na_filter=False, parse_dates=['Collection Date'])

# Keep only the second ':'-separated field of the host description.
ird_data['Host Species'] = ird_data['Host Species'].str.split(':').str[1]

# Drop everything from the first '(' onwards in the strain name.
ird_data['Strain Name'] = ird_data['Strain Name'].str.split('(').str[0]

# Manual data cleanup: this one strain name carries an extra '/', which is
# replaced by '_'. regex=False makes the literal match explicit.
ird_data['Strain Name'] = ird_data['Strain Name'].str.replace(
    'A/American black duck/Maine/44411/532/2008',
    'A/American black duck/Maine/44411_532/2008',
    regex=False,
)

# Treat the placeholder strings as missing values.
ird_data['State/Province'] = ird_data['State/Province'].replace(['Unknown', '-N/A-'], np.nan)

# '*' is a regex metacharacter; strip it as a literal character so the result
# does not depend on the pandas version's default for `regex`.
ird_data['Sequence Accession'] = ird_data['Sequence Accession'].str.replace('*', '', regex=False)

In [14]:
# Number of IRD records from Alaska, in units of 8 (one record per genome segment).
ird_data.loc[ird_data['State/Province'] == 'Alaska'].shape[0] / 8


Out[14]:
689.0

In [6]:
# Load the metadata table for the new viruses and display its column names.
# NOTE(review): the many 'Unnamed: N' columns in the output suggest trailing empty columns in the CSV — confirm.
new_data = pd.read_csv('Alaska_waterfowl_79viruses_metadata_20151223.csv')
new_data.columns


Out[6]:
Index(['Strain_name', 'Subtype', 'Blinded Number',
       'Organism Name provided by collaborator',
       'Updated Organism Name (names that changed are in blue)',
       'Special Note_JCVIs', 'Complete/Draft', 'CEIRS Sample ID', 'UPDATED?',
       'Age', 'Age_Brandt', 'Age_final', 'Sex', 'Species', 'SPECIES_CORRECT',
       'Species_final', 'Band#', 'Webtag#', 'BroodID', 'NestID',
       'CollectionDate', 'Latitude', 'Longitude', 'SampleType', 'Unnamed: 24',
       'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28',
       'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32',
       'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35'],
      dtype='object')

In [7]:
# Lookup table: genome segment number -> gene name.
segnum_name = {
    1: 'PB2',
    2: 'PB1',
    3: 'PA',
    4: 'HA',
    5: 'NP',
    6: 'NA',
    7: 'M',
    8: 'NS',
}

In [15]:
# Load all sequence records from the IRD FASTA download.
ird_seqs = list(SeqIO.parse('20160104_brandt_ird.fasta', 'fasta'))

def change_id_name(s, ird_data, segnum_name):
    """
    Rename a sequence record from its accession to ``strain_name|gene``.

    The row of ``ird_data`` whose 'Sequence Accession' equals ``s.id`` supplies
    the strain name and segment; the new ID is ``<Strain Name>|<gene name>``
    with every space replaced by an underscore. If several rows match, the
    first one is used.

    s: a BioPython seqrecord (only its ``id`` attribute is read and set)
    ird_data: the data downloaded from the IRD; needs the columns
        'Sequence Accession', 'Strain Name' and 'Segment'
    segnum_name: mapping from values of the 'Segment' column to gene names

    Returns ``s`` itself (modified in place). If the accession is not found in
    ``ird_data``, or its segment is not a key of ``segnum_name``, ``s`` is
    printed and returned unchanged.

    Raises KeyError if ``ird_data`` lacks one of the required columns, so a
    schema problem is reported once instead of being swallowed per record.
    """
    matches = ird_data.loc[ird_data['Sequence Accession'] == s.id]
    if matches.empty:
        # Best effort: report the record we could not look up and keep its ID.
        print(s)
        return s

    strain_name = matches['Strain Name'].iloc[0]
    segment = matches['Segment'].iloc[0]
    try:
        gene_name = segnum_name[segment]
    except KeyError:
        # Unknown segment value: report the record and keep its ID.
        print(s)
        return s

    s.id = (str(strain_name) + '|' + str(gene_name)).replace(' ', '_')
    return s

# Apply change_id_name to every IRD record through joblib; `results` collects the returned records.
results = Parallel(n_jobs=-1)(delayed(change_id_name)(s, ird_data, segnum_name) for s in ird_seqs)
# Record count in units of 8 (the number of entries in segnum_name).
print(len(results) / 8)


2733.5

In [16]:
# Write the records returned by change_id_name to a single FASTA file.
# NOTE(review): only `results` is written here; new_seqs is not merged into this file.
SeqIO.write(results, 'ird_seqs_name_as_accession.fasta', 'fasta')


Out[16]:
21868

In [ ]:


In [ ]: